Machine Learning Introduction


In [ ]:
from IPython.display import Image, display, HTML
Image("images/munich.jpg")

In [ ]:
display(HTML("<table><tr><td><p><b>Rain Princess - Leonid Afremov</b></p><img src='images/princess.jpeg'></td><td><b><p>Munich + Rain Princess + Machine Learning</b></p><img src='images/munich-princess-out.jpg'></td></tr></table>"))

In [ ]:
display(HTML("<table><tr><td><p><b>The Great Wave off Kanagawa - Katsushika Hokusai</b></p><img src='images/wave.jpg'></td><td><b><p>Munich + The Great Wave + Machine Learning</b></p><img src='images/munich-wave-out.jpg'></td></tr></table>"))

In [ ]:
display(HTML("<table><tr><td><p><b>La Muse - Pablo Picaso</b></p><img src='images/muse.jpg'></td><td><b><p>Munich + La Muse + Machine Learning</b></p><img src='images/munich-muse-out.jpg'></td></tr></table>"))

In [ ]:
display(HTML("<table><tr><td><p><b>Udnie - Francis Picabia</b></p><img src='images/udnie.jpg'></td><td><b><p>Munich + Udnie + Machine Learning</b></p><img src='images/munich-udnie-out.jpg'></td></tr></table>"))

In [ ]:
display(HTML("<table><tr><td><b><p>Scream - Edvard Munch</b></p><img src='images/scream.jpg'></td><td><b><p>Munich + Scream + Machine Learning</b></p><img src='images/munich-scream-out.jpg'></td></tr></table>"))

In [ ]:
display(HTML("<table><tr><td><p><b>The Shipwreck of the Minotaur - Joseph Mallord William Turner</b></p><img src='images/wreck.jpg'></td><td><b><p>Munich + Shipwreck + Machine Learning</b></p><img src='images/munich-wreck-out.jpg'></td></tr></table>"))

In [ ]:
# A bit about the MNIST dataset.
# The old tensorflow.examples.tutorials loader was removed in TensorFlow 2.x,
# so MNIST is loaded through tf.keras.datasets instead.
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
import tensorflow as tf

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

In [ ]:
%matplotlib inline
# Keras provides integer class labels directly, so no argmax over one-hot vectors is needed.
# We know that MNIST images are 28 pixels in each dimension.
img_size = 28
# Flattened, each image becomes a one-dimensional array of this length.
img_size_flat = img_size * img_size
# Tuple with height and width of images used to reshape arrays.
img_shape = (img_size, img_size)
# Number of classes, one class for each of 10 digits.
num_classes = 10
def plot_images(images, cls_true, cls_pred=None):
    assert len(images) == len(cls_true) == 9
    
    # Create figure with 3x3 sub-plots.
    fig, axes = plt.subplots(3, 3)
    fig.subplots_adjust(hspace=0.3, wspace=0.3)

    for i, ax in enumerate(axes.flat):
        # Plot image.
        ax.imshow(images[i].reshape(img_shape), cmap='binary')

        # Show true and predicted classes.
        if cls_pred is None:
            xlabel = "True: {0}".format(cls_true[i])
        else:
            xlabel = "True: {0}, Pred: {1}".format(cls_true[i], cls_pred[i])

        ax.set_xlabel(xlabel)
        
        # Remove ticks from the plot.
        ax.set_xticks([])
        ax.set_yticks([])
# Get the first images from the test-set.
images = x_test[0:9]
# Get the true classes for those images.
cls_true = y_test[0:9]
# Plot the images and labels using our helper-function above.
plot_images(images=images, cls_true=cls_true)
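
For a quick feel of how much data MNIST holds and how balanced the digit classes are, we can check the array shapes and the per-digit counts on the arrays loaded above (a minimal sketch):


In [ ]:
# training/test set sizes and the number of examples per digit class
print(x_train.shape, x_test.shape)
print(np.bincount(y_train))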

Python basics

Variables


In [ ]:
# String
string = 'Machine learning '
string2 = ' dojo '
string3 = ' part I'
print(string + string2 + string3)
print('String variable type is: {}'.format(type(string)))

In [ ]:
# Integers
number = 10
number2 = 20
number3 = 30
print(number + number2 + number3)
print('number variable type is: {}'.format(type(number)))

In [ ]:
# Booleans
boolean = True
boolean2 = True
boolean3 = False
print(boolean and boolean2 or boolean3)
print('boolean variable type is: {}'.format(type(boolean)))

In [ ]:
# Floating point numbers
floating = 3.14
floating2 = 2.79
floating3 = 10.01
print(floating + floating2 + floating3)
print('floating variable type is: {}'.format(type(floating)))

Conditional statements


In [ ]:
if 10 > 8:
    print('10 is greater than 8.')
    print('10 is greater than 8.')
    print('10 is greater than 8.')

In [ ]:
a = True
b = 10
c = 20
print('first if statement...')
if b < c and a:
    print('All fine.')
else:
    print('Not all fine.')

print('second if statement...')
if b < c and (not a):
    print('All fine.')
else:
    print('Not all fine.')

In [ ]:
if 10 > 20:
    message = "if only 10 were greater than 20"
elif 10 > 30:
    message = "elif means 'else if'"
else:
    message = "when all else fails use else " 
message

Loops


In [ ]:
for i in [1, 2, 3, 4, 5]:
    print(i)

In [ ]:
for x in range(5):
    if x == 3:
        continue  # go immediately to the next iteration
    if x == 5:
        break     # quit the loop entirely
    print(x)

In [ ]:
x = 0
while x < 5:
    print x, "is less than 5"
    x += 1

In [ ]:
a = True
x = 0
while a:
    print x, "is less than 10"
    x += 1
    if x >= 10:
        a = False

Data structures


In [215]:
# Lists
numbers = [1, 4, 9, 16, 25]

In [ ]:
numbers[:]

In [ ]:
numbers[:2]

In [ ]:
numbers[2:]

In [ ]:
type(numbers)

In [ ]:
letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
len(letters)

In [ ]:
letters[2]

In [ ]:
a = [66.25, 333, 333, 1, 1234.5]
a

In [ ]:
a.count(333), a.count(66.25), a.count('x')

In [ ]:
a.insert(2, -1)
a

In [ ]:
a.append(333)
a

In [ ]:
a.index(333)

In [ ]:
a.remove(333)
a

In [ ]:
a.reverse()
a

In [ ]:
a.sort()
a

In [ ]:
a.pop()

In [ ]:
a

In [232]:
# dictionaries
phones = {'Spiderman': 151984858, 'Me': 151234324}

In [ ]:
phones['Superman'] = 15104928
phones

In [ ]:
phones['Spiderman']

In [ ]:
del phones['Me']
phones

In [ ]:
phones['Batman'] = 15123545
phones

In [ ]:
phones.keys()

In [ ]:
'Ken' in phones

In [215]:
# tuples

In [ ]:
# use a name other than the built-in 'tuple'
t = 31213, 123453, 'hi ML!'
t

In [ ]:
t[0]

In [ ]:
t[2]

In [ ]:
# tuples are immutable, so this assignment raises a TypeError
t[1] = 1234

In [ ]:
tupleTheSecond = t, (1, 2, 3, 4, 5)
tupleTheSecond

In [244]:
t1, t2 = tupleTheSecond

In [ ]:
t1

In [ ]:
t2

In [ ]:
for i, j in zip(t1, t2):
    print(i, j)

In [ ]:
type(t1)

In [ ]:
# sets

In [249]:
basket = ['apple', 'orange', 'apple', 'pear', 'orange', 'banana']

In [250]:
fruit = set(basket)

In [ ]:
fruit

In [ ]:
'orange' in fruit

In [ ]:
'plum' in fruit

Pandas basics


In [254]:
# import the pandas library
import pandas as pd

In [ ]:
# show the version of the pandas library
print(pd.__version__)
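
Before we load a real dataset further below, here is a minimal sketch of the core pandas object: a DataFrame built from a dict of columns, where each column is a Series (the values are purely illustrative):


In [ ]:
# a DataFrame is a table of named columns; each column is a pandas Series
df_demo = pd.DataFrame({'city': ['Munich', 'Berlin', 'Hamburg'],
                        'population_m': [1.5, 3.7, 1.8]})
print(df_demo.dtypes)
df_demo.describe()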

In [ ]:
# it is all about describing the data - a quick matplotlib 3D scatter warm-up with random points
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

def randrange(n, vmin, vmax):
    '''
    Helper function to make an array of random numbers having shape (n, )
    with each number distributed Uniform(vmin, vmax).
    '''
    return (vmax - vmin)*np.random.rand(n) + vmin

fig = plt.figure(figsize=(14, 12))
ax = fig.add_subplot(111, projection='3d')

n = 100

# For each set of style and range settings, plot n random points in the box
# defined by x in [23, 32], y in [0, 100], z in [zlow, zhigh].
for c, m, zlow, zhigh in [('r', 'o', -50, -25), ('b', '^', -30, -5)]:
    xs = randrange(n, 23, 32)
    ys = randrange(n, 0, 100)
    zs = randrange(n, zlow, zhigh)
    ax.scatter(xs, ys, zs, c=c, marker=m)

ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')

plt.show()

House Sales in King County, USA

The dataset features are self-explanatory. The dataset is taken from the Kaggle website.


In [257]:
# read csv file
nn = pd.read_csv('kc_house_data.csv')

In [ ]:
# top 10 data records
nn.head(10)

In [ ]:
# check whether any of the columns contain null values
nn.isnull().any()

len(nn)

In [262]:
# add one record that has NaN values in every column except id and price
# (DataFrame.append was removed in pandas 2.0, so pd.concat is used instead)
nn = pd.concat([nn, pd.DataFrame([{'id': '12345', 'price': '12345.23'}])], ignore_index=True)

In [ ]:
len(nn)

In [ ]:
# check number of NaN values in some column
len(nn[nn.bedrooms.isnull()])

In [ ]:
# show the records where the bedrooms column contains NaN values
nn[nn.bedrooms.isnull()]

In [266]:
# drop NaN values
nn = nn.dropna()

In [ ]:
# check the number of NaN records after dropping NaNs
len(nn[nn.bedrooms.isnull()])

len(nn)

In [ ]:
nn.describe()

Adding new columns


In [ ]:
# 1 square foot = 0.092903 square meters
foot_to_meter_ratio = 0.092903
nn['sqm2_living'] = nn['sqft_living'] * foot_to_meter_ratio
nn['sqm2_living'] = nn['sqm2_living'].round(0)

nn['sqm2_lot'] = nn['sqft_lot'] * foot_to_meter_ratio
nn['sqm2_lot'] = nn['sqm2_lot'].round(0)

# show all columns and more rows when displaying data frames
pd.set_option("display.max_columns", 99)
pd.set_option("display.max_rows", 999)

nn.head()

In [ ]:
nn['sqm2_basement'] = nn['sqft_basement'].map(lambda x: round(x * foot_to_meter_ratio, 0))
nn.head()

In [ ]:
nn['price_low'] = 0
condition = nn['price'] < 100000
nn.loc[condition, 'price_low'] = 1
nn.loc[~condition, 'price_low'] = 0
nn['price_low'].value_counts()

In [ ]:
new = nn[(nn['price'] < 100000)] 
new

In [ ]:
nn['bedrooms'].value_counts()

In [ ]:
counts = nn.groupby('bedrooms').size()
counts

In [ ]:
# check waterfront column values
nn['waterfront'].value_counts()

In [ ]:
# select all properties with waterfront
waterfront = nn[(nn['waterfront'] == 1)]
waterfront

In [ ]:
waterfront_1_room = nn[(nn['waterfront'] == 1) & (nn['bedrooms'] == 1)]
waterfront_1_room

In [ ]:
waterfront.describe()

Histograms - data distributions


In [303]:
plt.figure(figsize=(10, 5))
plt.hist(nn['bedrooms'])
plt.show()

In [ ]:
plt.figure(figsize=(10, 5))
plt.hist(nn['price'])
plt.show()

In [ ]:
plt.figure(figsize=(10, 5))
plt.hist(nn['sqft_living'])
plt.show()

In [ ]:
plt.figure(figsize=(10, 5))
plt.hist(nn['sqft_lot'])
plt.show()

Miscellaneous


In [ ]:
def colorFunction(x):
    if x == 0:
        return 'black'
    elif x == 1:
        return 'brown'
    elif x == 2:
        return 'red'
    elif x == 3:
        return 'blue'
    elif x == 4:
        return 'green'
    elif x == 5:
        return 'pink'
    elif x == 6:
        return 'orange'
    elif x == 7:
        return 'cyan'
    elif x == 8:
        return 'yellow'
    elif x == 9:
        return 'magenta'
    else:
        return 'pink'
    
nn['color'] = nn['bedrooms'].apply(colorFunction)

figure = plt.figure()
subplot = figure.add_subplot(111)
scatter = subplot.scatter(nn['long'], nn['lat'], s=10, c=nn['color'])
subplot.set_xlabel('Longitude')
subplot.set_ylabel('Latitude')
figure.set_figheight(10)
figure.set_figwidth(15)
plt.show()

In [ ]:
features = nn.drop(['id','price','date','color'], axis = 1)

# Using pyplot
plt.figure(figsize=(20, 55))

# plot price against every remaining feature, with a fitted regression line
for i, col in enumerate(features.columns):
    # 10 x 3 grid of sub-plots, one per feature column
    plt.subplot(10, 3, i+1)
    x = nn[col]
    y = nn['price']
    plt.plot(x, y, 'o')
    # Create regression line
    plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('prices')
plt.show()

In [ ]:
# fit a normal distribution to the sale prices
(mu, sigma) = norm.fit(nn['price'])

# the histogram of the data
n, bins, patches = plt.hist(nn['price'], 60, density=True, facecolor='green', alpha=0.75)

# add a 'best fit' line (normal density with the fitted mu and sigma)
y = norm.pdf(bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=2)

# plot
plt.xlabel('Sales prices')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ sale\ prices:}\ \mu=%.3f,\ \sigma=%.3f$' % (mu, sigma))
plt.grid(True)

plt.show()

A bit about correlation


In [ ]:
# plot the heatmap
nn = pd.read_csv('kc_house_data.csv')
# drop the identifier and date columns so only numeric features remain for corr()
nn = nn.drop(['id', 'date'], axis=1)
plt.figure(figsize=(14, 12))
sns.heatmap(nn.corr())
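
The heatmap gives a visual overview; to read off the strength of each relationship with the target numerically, the correlations with price can be sorted (a short sketch on the same frame):


In [ ]:
# correlation of every column with the sale price, strongest first
nn.corr()['price'].sort_values(ascending=False)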

In [ ]:
# showing correlations in the table

cmap = sns.diverging_palette(5, 250, as_cmap=True)

def magnify():
    return [dict(selector="th",
                 props=[("font-size", "7pt")]),
            dict(selector="td",
                 props=[('padding', "0em 0em")]),
            dict(selector="th:hover",
                 props=[("font-size", "12pt")]),
            dict(selector="tr:hover td:hover",
                 props=[('max-width', '200px'),
                        ('font-size', '12pt')])
]

nn.corr().style.background_gradient(cmap, axis=1)\
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})\
    .set_caption("Hover to magnify")\
    .format(precision=2)\
    .set_table_styles(magnify())

House Prices - Data fields description

Here's a brief version of what you'll find in the data description file.

  • SalePrice - the property's sale price in dollars. This is the target variable that you're trying to predict.
  • MSSubClass: The building class
  • MSZoning: The general zoning classification
  • LotFrontage: Linear feet of street connected to property
  • LotArea: Lot size in square feet
  • Street: Type of road access
  • Alley: Type of alley access
  • LotShape: General shape of property
  • LandContour: Flatness of the property
  • Utilities: Type of utilities available
  • LotConfig: Lot configuration
  • LandSlope: Slope of property
  • Neighborhood: Physical locations within Ames city limits
  • Condition1: Proximity to main road or railroad
  • Condition2: Proximity to main road or railroad (if a second is present)
  • BldgType: Type of dwelling
  • HouseStyle: Style of dwelling
  • OverallQual: Overall material and finish quality
  • OverallCond: Overall condition rating
  • YearBuilt: Original construction date
  • YearRemodAdd: Remodel date
  • RoofStyle: Type of roof
  • RoofMatl: Roof material
  • Exterior1st: Exterior covering on house
  • Exterior2nd: Exterior covering on house (if more than one material)
  • MasVnrType: Masonry veneer type
  • MasVnrArea: Masonry veneer area in square feet
  • ExterQual: Exterior material quality
  • ExterCond: Present condition of the material on the exterior
  • Foundation: Type of foundation
  • BsmtQual: Height of the basement
  • BsmtCond: General condition of the basement
  • BsmtExposure: Walkout or garden level basement walls
  • BsmtFinType1: Quality of basement finished area
  • BsmtFinSF1: Type 1 finished square feet
  • BsmtFinType2: Quality of second finished area (if present)
  • BsmtFinSF2: Type 2 finished square feet
  • BsmtUnfSF: Unfinished square feet of basement area
  • TotalBsmtSF: Total square feet of basement area
  • Heating: Type of heating
  • HeatingQC: Heating quality and condition
  • CentralAir: Central air conditioning
  • Electrical: Electrical system
  • 1stFlrSF: First Floor square feet
  • 2ndFlrSF: Second floor square feet
  • LowQualFinSF: Low quality finished square feet (all floors)
  • GrLivArea: Above grade (ground) living area square feet
  • BsmtFullBath: Basement full bathrooms
  • BsmtHalfBath: Basement half bathrooms
  • FullBath: Full bathrooms above grade
  • HalfBath: Half baths above grade
  • Bedroom: Number of bedrooms above basement level
  • Kitchen: Number of kitchens
  • KitchenQual: Kitchen quality
  • TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
  • Functional: Home functionality rating
  • Fireplaces: Number of fireplaces
  • FireplaceQu: Fireplace quality
  • GarageType: Garage location
  • GarageYrBlt: Year garage was built
  • GarageFinish: Interior finish of the garage
  • GarageCars: Size of garage in car capacity
  • GarageArea: Size of garage in square feet
  • GarageQual: Garage quality
  • GarageCond: Garage condition
  • PavedDrive: Paved driveway
  • WoodDeckSF: Wood deck area in square feet
  • OpenPorchSF: Open porch area in square feet
  • EnclosedPorch: Enclosed porch area in square feet
  • 3SsnPorch: Three season porch area in square feet
  • ScreenPorch: Screen porch area in square feet
  • PoolArea: Pool area in square feet
  • PoolQC: Pool quality
  • Fence: Fence quality
  • MiscFeature: Miscellaneous feature not covered in other categories
  • MiscVal: $Value of miscellaneous feature
  • MoSold: Month Sold
  • YrSold: Year Sold
  • SaleType: Type of sale
  • SaleCondition: Condition of sale

More about this dataset can be found on the Kaggle website.
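
Before the file is read in the next cell, it can also help to check how pandas interprets each of these fields (numeric or string) and how many values are missing per column. A minimal sketch, assuming the same housing_train.csv used below (the variable names are illustrative):


In [ ]:
import pandas as pd

housing_raw = pd.read_csv('housing_train.csv')

# dtype and missing-value count per column, sorted by the number of missing values
overview = pd.DataFrame({'dtype': housing_raw.dtypes.astype(str),
                         'missing': housing_raw.isnull().sum()})
overview.sort_values('missing', ascending=False).head(20)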


In [ ]:
# read csv data
data = pd.read_csv('housing_train.csv')
# describe dataset
data.describe()

In [ ]:
# show first 5 records in the dataset

data.head()

In [ ]:
# show last 5 records in the dataset

data.tail()

In [239]:
# row selection: take the first 15 records
dataTemp = data[0:15]

In [ ]:
# iteration over rows: iterrows() yields (index, row) pairs
for idx, row in dataTemp.iterrows():
    print(row['SalePrice'])

In [ ]:
data['Lambda'] = data['SalePrice'].apply(lambda x: x * 1.1)
dataTemp = data[0:15]
# every third row of those 15 records
dataTemp[::3]

In [243]:
columns = ['SalePrice', 'LotArea', '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'YrSold']
data = data[columns]

In [ ]:
plt.figure(figsize=(10, 5))
plt.hist(data['SalePrice'])
plt.show()

plt.figure(figsize=(10, 5))
plt.hist(data['LotArea'])
plt.show()

plt.figure(figsize=(10, 5))
plt.hist(data['BedroomAbvGr'])
plt.show()

In [ ]:
len(data['SalePrice'])

In [246]:
# Data filtering
dataFiltering = data[['SalePrice', 'BedroomAbvGr','LotArea']].copy()

In [ ]:
dataFiltering.head()

In [ ]:
# Handling NaN values
original = pd.read_csv('housing_train.csv')
# original.isnull().any()
# show only the columns that contain at least one NaN value
original.loc[:, original.isnull().any()]

In [ ]:
original.dropna(subset=["LotFrontage"])    # option 1

original.drop("LotFrontage", axis=1)       # option 2

median = housing["LotFrontage"].median()
original["LotFrontage"].fillna(median)     # option 3

In [ ]:
# filter with a boolean mask built from str.contains; the ~ operator (next cell) negates such a mask
count = original[(original["MSZoning"].str.contains('RL'))]
len(count)
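
As a quick illustration of the ~ operator mentioned above, the same mask can be negated to select the complementary rows (a minimal sketch; not_rl is just an illustrative name):


In [ ]:
# the ~ operator negates a boolean mask: rows whose MSZoning does NOT contain 'RL'
not_rl = original[~original["MSZoning"].str.contains('RL')]
len(not_rl)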

In [ ]:
# fit a normal distribution to the sale prices
(mu, sigma) = norm.fit(data['SalePrice'])

# the histogram of the data
n, bins, patches = plt.hist(data['SalePrice'], 60, density=True, facecolor='green', alpha=0.75)

# add a 'best fit' line (normal density with the fitted mu and sigma)
y = norm.pdf(bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=2)

# plot
plt.xlabel('Sales prices')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ sale\ prices:}\ \mu=%.3f,\ \sigma=%.3f$' % (mu, sigma))
plt.grid(True)

plt.show()

In [ ]:
prices = data['SalePrice']
features = data.drop('SalePrice', axis=1)

# one tall figure with a 5 x 1 grid of sub-plots, one per remaining feature
plt.figure(figsize=(20, 35))
for i, col in enumerate(features.columns):
    plt.subplot(5, 1, i+1)
    x = data[col]
    y = prices
    plt.plot(x, y, 'o')
    # Create regression line
    plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('prices')
plt.show()

In [ ]:
# 1 square foot = 0.092903 square meters
foot_to_meter_ratio = 0.092903
data['LotAream2'] = data['LotArea'] * foot_to_meter_ratio
data['LotAream2'] = data['LotAream2'].round(0)
data.head()

In [ ]:
x = data['LotAream2']
y = prices
plt.figure(figsize=(20, 10))
plt.plot(x, y, 'o')
# Create regression line
plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
plt.title(x.name)
plt.xlabel(x.name)
plt.ylabel('prices')
plt.show()

In [ ]:
# create a smaller data set and filter out extreme values
dataM2 = data[['SalePrice', 'LotAream2']].copy()
low = .05
high = .9
quant_df = dataM2.quantile([low, high])
print(quant_df)

In [258]:
# keep only values between the 5th and 90th percentiles; everything outside becomes NaN
dataM2 = dataM2.apply(lambda x: x[(x > quant_df.loc[low, x.name]) & (x < quant_df.loc[high, x.name])], axis=0)

In [ ]:
dataM2.head()

In [ ]:
len(dataM2['SalePrice'])

In [260]:
dataM2['BedroomAbvGr'] = data['BedroomAbvGr'].copy()
# drop the rows that the quantile filter above turned into NaN
dataM2 = dataM2.dropna()
dataM2.head()

In [ ]:
x = dataM2['LotAream2']
y = dataM2['SalePrice']
plt.figure(figsize=(20, 15))
plt.plot(x, y, 'o')
# Create regression line
plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
plt.title(x.name)
plt.xlabel(x.name)
plt.ylabel('prices')
plt.show()

In [ ]:
dataM2["BedroomAbvGr"].value_counts()

In [ ]:
figure = plt.figure()
subplot = figure.add_subplot(111)
# color each point by the number of bedrooms so the colorbar below is meaningful
scatter = subplot.scatter(dataM2['LotAream2'], dataM2['SalePrice'], s=50,
                          c=dataM2['BedroomAbvGr'], cmap='viridis')
subplot.set_xlabel('Lot in m2')
subplot.set_ylabel('Price')
plt.colorbar(scatter)
figure.set_figheight(10)
figure.set_figwidth(15)
plt.show()

In [170]:
# Correlation matrix
corr_matrix = data.corr()

In [ ]:
corr_matrix["SalePrice"].sort_values(ascending=False)

In [ ]:
attributes = ["SalePrice", "LotAream2", "BedroomAbvGr", "1stFlrSF", "2ndFlrSF"]
scatter_matrix(data[attributes], figsize=(15, 15))
data.plot(kind="scatter", x="LotAream2", y="SalePrice", alpha=0.1)
plt.show()

In [173]:
# a bit more data filtering
df = data[['SalePrice', 'LotAream2', 'BedroomAbvGr']].copy()

In [ ]:
df.head()
len(df)

In [264]:
# drop rows whose lot size is more than three standard deviations above the mean (outliers)
filtered = df.drop(
    df.index[(df['LotAream2'] > (df['LotAream2'].mean() + 3 * df['LotAream2'].std()))])

In [ ]:
x = filtered['LotAream2']
y = filtered['SalePrice']
plt.figure(figsize=(20, 15))
plt.plot(x, y, 'o')
# Create regression line
plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
plt.title(x.name)
plt.xlabel(x.name)
plt.ylabel('prices')
plt.show()

In [182]:
filtered.describe()

In [ ]:
data.describe()

In [ ]:
counts = filtered.groupby('BedroomAbvGr').size()
counts.head()
